The following script is used to analyse the dataset containing phenotypic data (max.specific growth rate, lag phase, biomass and ethanol yield and cell dry weight) of 24 Saccharomyces cerevisiae strains cultivated in 29 conditions (triplicates).

Load libraries

library(tidyverse)
library(ggplot2)
library(ggpubr)
library(ggsci)
library(RColorBrewer)
library(ggridges)
library(stats)
library(gghighlight)
library(ggbeeswarm)
library(uwot)
library(ggpubr)
library(GGally)

Load data

df_strain_long <- readRDS("~/Library/CloudStorage/OneDrive-Chalmers/PhD/Papers/MSB/Github/scripts/df_strain_long_tr.rds")

Plotting strain phenotypes

Visualize data for each strain and phenotype

ggplot(df_strain_long, aes(x=strain, y=value, order=strain_type)) +
  geom_violin(alpha=0.5, color="gray")+
  geom_point(aes(colour = factor(group), alpha=0.9))+
  # geom_boxplot()+
  facet_wrap(~phenotypes,  ncol=1, strip.position = "left", scales ="free_y")+
  scale_color_manual(values = c("acid" = "#00AFBB", "alcohol" = "#3581d8", "hexose" = "#fba465", "pentose" = "#FC4E07", "salt" = "grey", "aldehyde" = "#f2c85b"))+
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
## Warning: Removed 225 rows containing non-finite values (`stat_ydensity()`).
## Warning: Removed 225 rows containing missing values (`geom_point()`).

Visualize only the means for each group of conditions and for each strain

df_strain <-  df_strain_long %>% pivot_wider(names_from = phenotypes,values_from = value)

#calculating different means based on conditions, strains, groups and replicates
#datasets for plotting
ggdot_mean <- df_strain %>%
  gather(key="phenotypes", value="value", c("umax", "lag", "Yx", "Yp", "CDW")) %>%
  group_by(strain, phenotypes, group ) %>%
  mutate(mean=mean(value,  na.rm=TRUE)) 
head(ggdot_mean)
Sample strain strain_type condition replicate group phenotypes value mean
Glu_65_A1_CENPK7D CENPK7D lab Glu_65 1 hexose umax 0.1443125 0.1085969
Glu_65_A2_CENPK7D CENPK7D lab Glu_65 2 hexose umax 0.1451979 0.1085969
Glu_65_A3_CENPK7D CENPK7D lab Glu_65 3 hexose umax 0.1462409 0.1085969
Glu_30_A4_CENPK7D CENPK7D lab Glu_30 1 hexose umax 0.1469902 0.1085969
Glu_30_A5_CENPK7D CENPK7D lab Glu_30 2 hexose umax 0.1412626 0.1085969
Glu_30_A6_CENPK7D CENPK7D lab Glu_30 3 hexose umax 0.1430554 0.1085969
ggdot_mean2 <- ggdot_mean %>%
  group_by(strain, phenotypes ) %>%
  mutate(mean=mean(value,  na.rm=TRUE)) %>%
  filter(condition=="Glu_30", replicate==1)
head(ggdot_mean2)
Sample strain strain_type condition replicate group phenotypes value mean
Glu_30_A4_CENPK7D CENPK7D lab Glu_30 1 hexose umax 0.1469902 0.0939345
Glu_30_A4_ETRED ETRED ind Glu_30 1 hexose umax 0.1475120 0.1205679
Glu_30_A4_LBCM1001 LBCM1001 LBCM Glu_30 1 hexose umax 0.1484369 0.0795548
Glu_30_A4_LBCM1003 LBCM1003 LBCM Glu_30 1 hexose umax 0.1524228 0.1017694
Glu_30_A4_LBCM1008 LBCM1008 LBCM Glu_30 1 hexose umax 0.1302099 0.0853253
Glu_30_A4_LBCM1013 LBCM1013 LBCM Glu_30 1 hexose umax 0.1566515 0.0970893
#calculating means to compare datapoints
mean_phenotype_group <- df_strain %>%
  gather(key="phenotypes", value="value", c("umax", "lag", "Yx", "Yp", "CDW")) %>%
  group_by(phenotypes, group ) %>%
 dplyr::summarise(mean_value = mean(value, na.rm=TRUE))
## `summarise()` has grouped output by 'phenotypes'. You can override using the
## `.groups` argument.
head(mean_phenotype_group)
phenotypes group mean_value
CDW acid 1.4167610
CDW alcohol 0.5014860
CDW aldehyde 0.9938367
CDW hexose 2.5982082
CDW pentose 1.0163559
CDW salt 1.4869071
phenotype_mean <- ggdot_mean %>%
  group_by(phenotypes ) %>%
  mutate(mean=mean(value,  na.rm=TRUE)) %>%
  filter(strain=="CENPK7D",condition=="Glu_30", replicate==1)
head(phenotype_mean)
Sample strain strain_type condition replicate group phenotypes value mean
Glu_30_A4_CENPK7D CENPK7D lab Glu_30 1 hexose umax 0.1469902 0.0964515
Glu_30_A4_CENPK7D CENPK7D lab Glu_30 1 hexose lag 6.4992669 7.8736138
Glu_30_A4_CENPK7D CENPK7D lab Glu_30 1 hexose Yx 0.1183531 0.1092942
Glu_30_A4_CENPK7D CENPK7D lab Glu_30 1 hexose Yp 0.2602114 0.0670376
Glu_30_A4_CENPK7D CENPK7D lab Glu_30 1 hexose CDW 3.4974000 1.5002589
condition_mean <- ggdot_mean %>%
  group_by(phenotypes, condition) %>%
  mutate(mean=mean(value,  na.rm=TRUE))%>%
  filter(replicate==1,  strain=="CENPK7D")
head(condition_mean)
Sample strain strain_type condition replicate group phenotypes value mean
Glu_65_A1_CENPK7D CENPK7D lab Glu_65 1 hexose umax 0.1443125 0.1425594
Glu_30_A4_CENPK7D CENPK7D lab Glu_30 1 hexose umax 0.1469902 0.1382374
Xyl_36_A7_CENPK7D CENPK7D lab Xyl_36 1 pentose umax 0.0994732 0.0975475
Xyl_16_A10_CENPK7D CENPK7D lab Xyl_16 1 pentose umax 0.1049668 0.1051068
Gal_4.5_B1_CENPK7D CENPK7D lab Gal_4.5 1 hexose umax 0.0940127 0.1093622
Gal_2_B4_CENPK7D CENPK7D lab Gal_2 1 hexose umax 0.0899561 0.0947701
p <- ggdotchart(ggdot_mean, x = "strain", y = "mean",
           group = "strain_type",
           color = "group",                                # Color by groups
           palette = c("#00AFBB","#3581d8","#f2c85b", "#fba465", "#FC4E07", "grey"), # Custom color palette                           # Add segments from y = 0 to dots
           add.params = list(color = "lightgray", size = 2), # Change segment color and size
           dot.size = 2,                                 # Large dot size
           ggtheme = theme_light()                        # ggplot2 theme
)+
  geom_hline(yintercept = 0, linetype = 2, color = "lightgray") +
  facet_grid(vars(phenotypes), scales = "free_y")+
  theme(
    text = element_text(size = 20),
    axis.text = element_text(size = 20),
    axis.title = element_text(size = 20, face = "bold"),
    strip.background = element_rect(fill = "grey"),
    strip.text.x = element_text(size = 20, color = "black", face = "bold"),
    strip.text.y = element_text(size = 20, color = "black", face = "bold")
  )

p2 <- p +
  geom_point(data = ggdot_mean2,aes(x=strain, y=mean), shape=95, size= 7)+
  facet_grid(vars(phenotypes), scales = "free_y")
p2

#ggsave("phenotype_mean.svg", width = 500, height = 300, unit = "mm", dpi = 300, path="~/Library/CloudStorage/OneDrive-Chalmers/PhD/Papers/MSB/Path_to_submission/Final Figures/R_figures")

Create metadata

df_meta <- 
  df_strain_long %>% 
  select(Sample,strain,strain_type,condition,replicate,group) %>% 
  distinct()

PCA

Pre-processing

Let’s begin by log-normalizing the data

df_strain_long %>% 
  mutate(value_log = log10(value+1e-4)) %>% View()

pca_data <- 
  df_strain_long %>% 
  mutate(value_log = log10(value+1e-4)) %>% 
  select(-value) %>% 
  pivot_wider(names_from = phenotypes,values_from = value_log)

head(pca_data)
Sample strain strain_type condition replicate group CDW Yx Yp umax lag
Glu_65_A1_CENPK7D CENPK7D lab Glu_65 1 hexose 0.7409545 -1.0684447 -1.1284004 -0.8403951 0.8330939
Glu_65_A2_CENPK7D CENPK7D lab Glu_65 2 hexose 0.7524249 -1.0599972 -1.0014028 -0.8377407 0.8323042
Glu_65_A3_CENPK7D CENPK7D lab Glu_65 3 hexose 0.7259035 -1.0849855 -0.9420420 -0.8346342 0.8473676
Glu_30_A4_CENPK7D CENPK7D lab Glu_30 1 hexose 0.5437577 -0.9264536 -0.5845068 -0.8324163 0.8128711
Glu_30_A5_CENPK7D CENPK7D lab Glu_30 2 hexose 0.5319513 -0.9365973 -0.6073209 -0.8496655 0.8099530
Glu_30_A6_CENPK7D CENPK7D lab Glu_30 3 hexose 0.5267914 -0.9483274 -0.4836635 -0.8441922 0.7436396

Make data suitable for prcomp function

df_pca <- 
  pca_data %>% 
  select(Sample,CDW,Yx,Yp,umax,lag) %>% 
  as.data.frame()

rownames(df_pca) <- df_pca$Sample

df_pca <- 
  df_pca %>% 
  select(-Sample)

Run PCA

pca1 <- prcomp(na.omit(df_pca),center = T,scale. = F)

pca1$x %>% head()
##                          PC1         PC2       PC3         PC4         PC5
## Glu_65_A1_CENPK7D -0.4955068  0.19883285 0.5328749 -0.14936278 -0.10156267
## Glu_65_A2_CENPK7D -0.6015676  0.13087592 0.5505046 -0.16179786 -0.10411027
## Glu_65_A3_CENPK7D -0.6254141  0.06681808 0.5520449 -0.17399608 -0.08783624
## Glu_30_A4_CENPK7D -0.9447666 -0.15021062 0.3712173 -0.12596684 -0.03088934
## Glu_30_A5_CENPK7D -0.9163720 -0.15087262 0.3607644 -0.12263541 -0.04279616
## Glu_30_A6_CENPK7D -1.0112277 -0.23511007 0.3865725 -0.06899068 -0.05846269

Store PCA results and merge with metadata

df_pcares <- 
  pca1$x %>% 
  as_tibble(rownames = "Sample") %>% 
  select(Sample,PC1,PC2)
head(df_pcares)
Sample PC1 PC2
Glu_65_A1_CENPK7D -0.4955068 0.1988329
Glu_65_A2_CENPK7D -0.6015676 0.1308759
Glu_65_A3_CENPK7D -0.6254141 0.0668181
Glu_30_A4_CENPK7D -0.9447666 -0.1502106
Glu_30_A5_CENPK7D -0.9163720 -0.1508726
Glu_30_A6_CENPK7D -1.0112277 -0.2351101
df_pca_res <- 
  df_pcares %>% 
  full_join(df_meta,by = "Sample")

df_pca_res %>% 
  ggplot(aes(x=PC1,y=PC2,color=group)) +
  geom_point() +
  stat_ellipse() +
  theme_bw() +
  theme(aspect.ratio = 1)
## Warning: Removed 225 rows containing non-finite values (`stat_ellipse()`).
## Warning: Removed 225 rows containing missing values (`geom_point()`).

df_pca_res %>% 
  ggplot(aes(x=PC1,y=PC2,color=strain_type)) +
  geom_point() +
  stat_ellipse() +
  theme_bw() +
  theme(aspect.ratio = 1)
## Warning: Removed 225 rows containing non-finite values (`stat_ellipse()`).
## Removed 225 rows containing missing values (`geom_point()`).

Check outliers

outlier_samples <- 
  df_pca_res %>% 
  filter(PC1>5 & PC2<0) %>% 
  pull(Sample)

df_strain_long %>% 
  filter(Sample %in% outlier_samples) %>% 
  pivot_wider(names_from = phenotypes,values_from = value) %>% head()
Sample strain strain_type condition replicate group

UMAP

Run UMAP on log-normalized data and store output

set.seed(1980)
umap1 <- umap(na.omit(df_pca),n_components = 3)

df_umap_res <-
  umap1 %>%
  as_tibble(rownames = "Sample") %>%
  rename("UMAP1" = "V1") %>%
  rename("UMAP2" = "V2") %>%
  rename("UMAP3" = "V3") %>%
  full_join(df_meta, by = "Sample")
## Warning: The `x` argument of `as_tibble.matrix()` must have unique column names if
## `.name_repair` is omitted as of tibble 2.0.0.
## ℹ Using compatibility `.name_repair`.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Plot UMAP

df_umap_res %>% 
  ggplot(aes(x=UMAP1,y=UMAP2,color=group)) +
  geom_point() +
  stat_ellipse() +
  theme_bw() +
  theme(aspect.ratio = 1)
## Warning: Removed 225 rows containing non-finite values (`stat_ellipse()`).
## Warning: Removed 225 rows containing missing values (`geom_point()`).

plotly::plot_ly(df_umap_res, x = ~UMAP1, y = ~UMAP2, z = ~UMAP3, color = df_umap_res$group, colors = c("#00AFBB","#3581d8","#f2c85b", "#fba465", "#FC4E07", "grey")) 
## No trace type specified:
##   Based on info supplied, a 'scatter3d' trace seems appropriate.
##   Read more about this trace type -> https://plotly.com/r/reference/#scatter3d
## No scatter3d mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## Warning: Ignoring 225 observations
df_umap_res %>% 
  ggplot(aes(x=UMAP1,y=UMAP2,color=strain_type)) +
  geom_point() +
  stat_ellipse() +
  theme_void() +
  theme(aspect.ratio = 1) +
  ggtitle("UMAP")
## Warning: Removed 225 rows containing non-finite values (`stat_ellipse()`).
## Removed 225 rows containing missing values (`geom_point()`).

Violin plots

By group

df_strain_long %>% 
  mutate(sugar = group) %>% 
  select(-group) %>% 
  ggplot(aes(x=sugar,y=value, fill = sugar)) +
  geom_jitter()
## Warning: Removed 225 rows containing missing values (`geom_point()`).

df_strain_long %>% 
  ggplot(aes(x=group,y=value, fill = group)) +
  geom_violin(draw_quantiles = 0.5) +
    scale_fill_manual(values=c("#00AFBB","#3581d8","#f2c85b", "#fba465", "#FC4E07", "grey"))+
 facet_grid(vars(phenotypes), scales = "free_y")+
  theme_light()+
  theme(
    text = element_text(size = 20),
    axis.text = element_text(size = 20),
    axis.title = element_text(size = 20, face = "bold"),
    strip.background = element_rect(fill = "grey"),
    strip.text.x = element_text(size = 20, color = "black", face = "bold"),
    strip.text.y = element_text(size = 20, color = "black", face = "bold")
  )
## Warning: Removed 225 rows containing non-finite values (`stat_ydensity()`).

#ggsave("groups_phenotype_distribution.svg", width = 500, height = 300, unit = "mm", dpi = 300, path="~/Library/CloudStorage/OneDrive-Chalmers/PhD/Papers/MSB/Path_to_submission/Final Figures/R_figures")

#Evaluate if the groups are statistically different
cm <- compare_means(value ~ group,  data = df_strain_long, ref.group = "hexose",
              method = "wilcox.test", group.by = "phenotypes")

By condition

df_strain_long %>% 
  mutate(condition = factor(condition,levels = c("Glu_20",
                                                 "Glu_30",
                                                 "Glu_65",
                                                 "Xyl_16",
                                                 "Xyl_36",
                                                 "Gal_2",
                                                 "Gal_4.5",
                                                 "Ara_2",
                                                 "Ara_4",
                                                 "Man_10",
                                                 "Man_16",
                                                 "EtOH_45",
                                                 "EtOH_90",
                                                 "NaCl_25",
                                                 "NaCl_80",
                                                 "FrAcid_1",
                                                 "FrAcid_3.5",
                                                 "AcAcid_4.5",
                                                 "AcAcid_6",
                                                 "LacAcid_2",
                                                 "LacAcid_7",
                                                 "LevAcid_2.5",
                                                 "LevAcid_5",
                                                 "HMF_0.5",
                                                 "HMF_6",
                                                 "Furf_1",
                                                 "Furf_3",
                                                 "Van_0.5",
                                                 "Van_2"),ordered = T)) %>% 
  ggplot(aes(x=condition,y=value, fill = condition)) +
  geom_violin() +
  facet_wrap(vars(phenotypes),ncol = 1,scales = "free_y") +
  theme_bw() +
  theme(aspect.ratio = 0.2,
        axis.text.x = element_text(angle = 45,hjust = 1)) +
  ggtitle("All phenotypes",subtitle = "Grouped by strain")+
  stat_compare_means(ref.group = "Glu_20",label = "p.signif",hide.ns = T)
## Warning: Removed 225 rows containing non-finite values (`stat_ydensity()`).
## Warning: Removed 225 rows containing non-finite values
## (`stat_compare_means()`).

Performance correlations

#run correlations for all the combinations of phenotypes
ggpairs(df_strain, columns = c(7:11),ggplot2::aes(colour=group), palette())
## Warning in cor(x, y): the standard deviation is zero
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 225 rows containing missing values
## Warning in cor(x, y): the standard deviation is zero
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 225 rows containing missing values
## Warning in cor(x, y): the standard deviation is zero
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 225 rows containing missing values
## Warning in cor(x, y): the standard deviation is zero
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 225 rows containing missing values
## Warning: Removed 225 rows containing missing values (`geom_point()`).
## Removed 225 rows containing missing values (`geom_point()`).
## Removed 225 rows containing missing values (`geom_point()`).
## Removed 225 rows containing missing values (`geom_point()`).
## Warning: Removed 225 rows containing non-finite values (`stat_density()`).

#visualize specific correlations with only two phenotypes per time
#ggscatter by phenotype
ggscatter(df_strain, x = 'umax', y = 'CDW', color = 'group', add = "reg.line", conf.int = FALSE, cor.method = "spearman", size=0.5,  palette = c("#00AFBB","#3581d8","#f2c85b", "#fba465", "#FC4E07", "grey"))+
  stat_cor(aes(color = group)) +
  yscale("log10", .format = TRUE)+
  # xscale("log10", .format = TRUE)+
  stat_cor(method = "spearman")+
  theme_light()+
  theme(
    text = element_text(size = 7),
    axis.text.x = element_text(size = 7, angle=90, vjust = 0.5,hjust = 1),
    axis.text.y = element_text(size = 7),
    axis.title = element_text(size = 7, face = "bold"),
    strip.background = element_rect(fill = "grey"),
    strip.text.x = element_text(size = 7, color = "black", face = "bold"),
    strip.text.y = element_text(size = 7, color = "black", face = "bold"))
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Transformation introduced infinite values in continuous y-axis
## Transformation introduced infinite values in continuous y-axis
## Transformation introduced infinite values in continuous y-axis
## Warning: Removed 225 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 225 rows containing non-finite values (`stat_cor()`).
## Removed 225 rows containing non-finite values (`stat_cor()`).

#ggsave("umax_CDW_corr.svg", width = 150, height = 110, unit = "mm", dpi = 300, path="~/Library/CloudStorage/OneDrive-Chalmers/PhD/Papers/MSB/Path_to_submission/Final Figures/R_figures")